Data Inspection¶
In [1]:
# Configure Plotly for offline rendering so figures display in-notebook
# without hitting the Plotly cloud service.
import plotly

plotly.offline.init_notebook_mode()
In [2]:
import pandas as pd
from tqdm import tqdm

# Read the 10-year top-24-airports dataset in 100k-row chunks so tqdm can
# show load progress; CANCELLATION_CODE is forced to str because it is
# mostly empty and would otherwise be inferred as float.
reader = pd.read_csv(
    "../data/Total_Data_10Y_Top24.csv",
    chunksize=100000,
    dtype={"CANCELLATION_CODE": str},
)
chunks = [part for part in tqdm(reader)]
df = pd.concat(chunks)
df.head()
130it [00:29, 4.45it/s]
Out[2]:
| FL_DATE | OP_UNIQUE_CARRIER | TAIL_NUM | ORIGIN_AIRPORT_SEQ_ID | ORIGIN_CITY_MARKET_ID | ORIGIN | ORIGIN_CITY_NAME | DEST_AIRPORT_SEQ_ID | DEST_CITY_MARKET_ID | DEST | ... | TAXI_IN | ARR_TIME | ARR_DELAY | CANCELLED | CANCELLATION_CODE | CARRIER_DELAY | WEATHER_DELAY | NAS_DELAY | SECURITY_DELAY | LATE_AIRCRAFT_DELAY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-07-01 | AA | N002AA | 1105703 | 31057 | CLT | Charlotte, NC | 1129803 | 30194 | DFW | ... | 28.0 | 1214.0 | 9.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 2014-07-01 | AA | N002AA | 1129803 | 30194 | DFW | Dallas/Fort Worth, TX | 1105703 | 31057 | CLT | ... | 13.0 | 945.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 2014-07-01 | AA | N004AA | 1039705 | 30397 | ATL | Atlanta, GA | 1129803 | 30194 | DFW | ... | 6.0 | 1341.0 | -9.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 2014-07-01 | AA | N004AA | 1129803 | 30194 | DFW | Dallas/Fort Worth, TX | 1039705 | 30397 | ATL | ... | 7.0 | 1159.0 | 4.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 2014-07-01 | AA | N004AA | 1129803 | 30194 | DFW | Dallas/Fort Worth, TX | 1039705 | 30397 | ATL | ... | 6.0 | 2317.0 | 2.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 24 columns
In [3]:
# Summary statistics (count / mean / std / quartiles) for all numeric columns.
df.describe()
Out[3]:
| ORIGIN_AIRPORT_SEQ_ID | ORIGIN_CITY_MARKET_ID | DEST_AIRPORT_SEQ_ID | DEST_CITY_MARKET_ID | DEP_TIME | DEP_DELAY | TAXI_OUT | TAXI_IN | ARR_TIME | ARR_DELAY | CANCELLED | CARRIER_DELAY | WEATHER_DELAY | NAS_DELAY | SECURITY_DELAY | LATE_AIRCRAFT_DELAY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.292656e+07 | 1.292656e+07 | 1.292656e+07 | 1.292656e+07 | 1.271494e+07 | 1.271486e+07 | 1.271048e+07 | 1.270744e+07 | 1.270744e+07 | 1.268139e+07 | 1.292656e+07 | 2.559603e+06 | 2.559603e+06 | 2.559603e+06 | 2.559603e+06 | 2.559603e+06 |
| mean | 1.298824e+06 | 3.161338e+04 | 1.298812e+06 | 3.161370e+04 | 1.332409e+03 | 1.127152e+01 | 1.722694e+01 | 8.678440e+00 | 1.471801e+03 | 5.409690e+00 | 1.675551e-02 | 2.041558e+01 | 2.650336e+00 | 1.520350e+01 | 1.268451e-01 | 2.413592e+01 |
| std | 1.453419e+05 | 1.168922e+03 | 1.453231e+05 | 1.168763e+03 | 5.192380e+02 | 4.514114e+01 | 9.141093e+00 | 6.704624e+00 | 5.554134e+02 | 4.738376e+01 | 1.283540e-01 | 5.935956e+01 | 2.208897e+01 | 3.191662e+01 | 3.210375e+00 | 5.163883e+01 |
| min | 1.039705e+06 | 3.019400e+04 | 1.039705e+06 | 3.019400e+04 | 1.000000e+00 | -2.340000e+02 | 0.000000e+00 | 1.000000e+00 | 1.000000e+00 | -2.380000e+02 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 25% | 1.129806e+06 | 3.046600e+04 | 1.129806e+06 | 3.046600e+04 | 9.060000e+02 | -5.000000e+00 | 1.200000e+01 | 5.000000e+00 | 1.052000e+03 | -1.400000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 50% | 1.289208e+06 | 3.145400e+04 | 1.289208e+06 | 3.145400e+04 | 1.324000e+03 | -1.000000e+00 | 1.500000e+01 | 7.000000e+00 | 1.515000e+03 | -5.000000e+00 | 0.000000e+00 | 3.000000e+00 | 0.000000e+00 | 2.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 75% | 1.410702e+06 | 3.245700e+04 | 1.410702e+06 | 3.245700e+04 | 1.751000e+03 | 9.000000e+00 | 2.000000e+01 | 1.000000e+01 | 1.930000e+03 | 9.000000e+00 | 0.000000e+00 | 2.000000e+01 | 0.000000e+00 | 1.900000e+01 | 0.000000e+00 | 2.800000e+01 |
| max | 1.501606e+06 | 3.481900e+04 | 1.501606e+06 | 3.481900e+04 | 2.400000e+03 | 3.695000e+03 | 2.270000e+02 | 1.419000e+03 | 2.400000e+03 | 3.680000e+03 | 1.000000e+00 | 3.359000e+03 | 2.692000e+03 | 1.511000e+03 | 9.870000e+02 | 3.581000e+03 |
In [4]:
# Latest flight date in the dataset. FL_DATE is still a string here (object
# dtype), but ISO "YYYY-MM-DD" strings sort chronologically, so max() works.
df.FL_DATE.max()
Out[4]:
'2024-06-30'
In [5]:
# Cardinality of every column — useful for spotting ID-like vs categorical fields.
df.nunique()
Out[5]:
FL_DATE 3653 OP_UNIQUE_CARRIER 20 TAIL_NUM 9140 ORIGIN_AIRPORT_SEQ_ID 58 ORIGIN_CITY_MARKET_ID 20 ORIGIN 24 ORIGIN_CITY_NAME 24 DEST_AIRPORT_SEQ_ID 58 DEST_CITY_MARKET_ID 20 DEST 24 DEST_CITY_NAME 24 DEP_TIME 1440 DEP_DELAY 1783 TAXI_OUT 195 TAXI_IN 247 ARR_TIME 1440 ARR_DELAY 1806 CANCELLED 2 CANCELLATION_CODE 4 CARRIER_DELAY 1558 WEATHER_DELAY 971 NAS_DELAY 855 SECURITY_DELAY 268 LATE_AIRCRAFT_DELAY 1228 dtype: int64
In [6]:
# Inferred dtypes; note FL_DATE is object (string) and needs parsing later.
df.dtypes
Out[6]:
FL_DATE object OP_UNIQUE_CARRIER object TAIL_NUM object ORIGIN_AIRPORT_SEQ_ID int64 ORIGIN_CITY_MARKET_ID int64 ORIGIN object ORIGIN_CITY_NAME object DEST_AIRPORT_SEQ_ID int64 DEST_CITY_MARKET_ID int64 DEST object DEST_CITY_NAME object DEP_TIME float64 DEP_DELAY float64 TAXI_OUT float64 TAXI_IN float64 ARR_TIME float64 ARR_DELAY float64 CANCELLED float64 CANCELLATION_CODE object CARRIER_DELAY float64 WEATHER_DELAY float64 NAS_DELAY float64 SECURITY_DELAY float64 LATE_AIRCRAFT_DELAY float64 dtype: object
Data Cleaning¶
In [7]:
# Keep only the columns needed for the delay analysis; identifier and
# time-of-day columns are dropped. (The original list contained
# "ORIGIN_AIRPORT_SEQ_ID" and "DEST_AIRPORT_SEQ_ID" twice — the duplicates
# are removed here. The unused matplotlib import is removed as well; the
# plotting cell below imports it itself.)
target_df = df.drop(columns=[
    "OP_UNIQUE_CARRIER",
    "ORIGIN_AIRPORT_SEQ_ID",
    "ORIGIN_CITY_MARKET_ID",
    "ORIGIN",
    "ORIGIN_CITY_NAME",
    "DEST_AIRPORT_SEQ_ID",
    "DEST_CITY_MARKET_ID",
    "DEST",
    "DEST_CITY_NAME",
    "DEP_TIME",
    "TAXI_OUT",
    "TAXI_IN",
    "ARR_TIME",
])
# Release the full ~13M-row frame; kernel memory persists across cells.
del df
# Parse flight dates into real datetimes for later time-based work.
target_df['FL_DATE'] = pd.to_datetime(target_df['FL_DATE'])
# Fraction of missing values per remaining column — the cause-of-delay
# columns are only populated for delayed flights, hence ~80% NaN.
target_df.isnull().mean()
Out[7]:
FL_DATE 0.000000 TAIL_NUM 0.004830 DEP_DELAY 0.016377 ARR_DELAY 0.018966 CANCELLED 0.000000 CANCELLATION_CODE 0.983244 CARRIER_DELAY 0.801989 WEATHER_DELAY 0.801989 NAS_DELAY 0.801989 SECURITY_DELAY 0.801989 LATE_AIRCRAFT_DELAY 0.801989 dtype: float64
In [8]:
# Unique tail numbers observed in the flight data.
plane_set_df = target_df.TAIL_NUM.drop_duplicates()
# FAA registration data keys off the raw N-number; TAIL_NUM in the flight
# data already carries the leading "N", so prefix the registry column to
# match. Vectorized string concat replaces the row-wise apply/lambda.
plane_review = pd.read_csv("../supplementary/N-Number-Registration-Data-2024.csv")
plane_review['N-NUMBER'] = "N" + plane_review['N-NUMBER']
# Keep only aircraft that actually appear in the flight data.
plane_review = plane_review[plane_review['N-NUMBER'].isin(plane_set_df)]
plane_review
Out[8]:
| N-NUMBER | COUNTRY | TYPE AIRCRAFT | TYPE ENGINE | MFR | MODEL | NO-ENG | NO-SEATS | AC-WEIGHT | MFR_ENGINE | MODEL_ENGINE | HORSEPOWER | THRUST | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 651 | N10156 | US | Fixed wing multi engine | Turbo-fan | EMBRAER | EMB-145XR | 2 | 55 | 12,500 - 19,999 | ROLLS-ROYC | AE3007 SER | 0.0 | 6442.0 |
| 813 | N101DQ | US | Fixed wing multi engine | Turbo-fan | AIRBUS | A321-211 | 2 | 199 | 20,000 and over | CFM INTL | CFM56-5B3/3 | 0.0 | 32000.0 |
| 816 | N101DU | US | Fixed wing multi engine | Turbo-fan | C SERIES AIRCRAFT LTD PTNRSP | BD-500-1A10 | 2 | 133 | 20,000 and over | P & W | PW1519G | 0.0 | 19775.0 |
| 874 | N101HQ | US | Fixed wing multi engine | Turbo-fan | EMBRAER-EMPRESA BRASILEIRA DE | ERJ 170-200 LR | 2 | 80 | 20,000 and over | GE | CF34-8E5 | 0.0 | 14510.0 |
| 946 | N101NN | US | Fixed wing multi engine | Turbo-fan | AIRBUS INDUSTRIE | A321-231 | 2 | 379 | 20,000 and over | IAE | V2500SERIES | 0.0 | 25000.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 297051 | N998AN | US | Fixed wing multi engine | Turbo-fan | AIRBUS | A321-231 | 2 | 379 | 20,000 and over | IAE | V2533-A5 | 0.0 | 31600.0 |
| 297052 | N998AT | US | Fixed wing multi engine | Turbo-fan | BOEING | 717-200 | 2 | 100 | 20,000 and over | ROLLS-ROYC | TAY 651-54 | 0.0 | 15400.0 |
| 297102 | N998JE | US | Fixed wing multi engine | Turbo-fan | AIRBUS | A321-231 | 2 | 379 | 20,000 and over | IAE | V2533-A5 | 0.0 | 31600.0 |
| 297126 | N998NN | US | Fixed wing multi engine | Turbo-fan | BOEING | 737-800 | 2 | 175 | 20,000 and over | CFM INTL | CFM56-7B24E | 0.0 | 24200.0 |
| 297463 | N999JQ | US | Fixed wing multi engine | Turbo-fan | AIRBUS | A321-231 | 2 | 379 | 20,000 and over | IAE | V2533-A5 | 0.0 | 31600.0 |
6798 rows × 13 columns
In [9]:
# Attach aircraft attributes to each flight via tail number. The inner join
# drops flights whose aircraft is missing from the registry extract.
# Method chaining replaces the inplace=True drop (inplace has no performance
# benefit and hides state changes on re-run).
target_airport_df = (
    target_df.merge(
        plane_review,
        how="inner",
        left_on="TAIL_NUM",
        right_on="N-NUMBER",
        suffixes=("", "_origin"),
    )
    .drop(columns=['N-NUMBER'])  # redundant copy of TAIL_NUM after the join
)
# Make "not cancelled" explicit instead of leaving NaN in the code column.
target_airport_df['CANCELLATION_CODE'] = target_airport_df['CANCELLATION_CODE'].fillna("Not")
target_airport_df.head()
Out[9]:
| FL_DATE | TAIL_NUM | DEP_DELAY | ARR_DELAY | CANCELLED | CANCELLATION_CODE | CARRIER_DELAY | WEATHER_DELAY | NAS_DELAY | SECURITY_DELAY | ... | TYPE ENGINE | MFR | MODEL | NO-ENG | NO-SEATS | AC-WEIGHT | MFR_ENGINE | MODEL_ENGINE | HORSEPOWER | THRUST | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-07-01 | N200AA | 3.0 | -13.0 | 0.0 | Not | NaN | NaN | NaN | NaN | ... | Turbo-prop | TEXTRON AVIATION INC | 208B | 1 | 12 | Up to 12,499 | P&W CANADA | PT6A-140 | 867.0 | 0.0 |
| 1 | 2014-07-01 | N200AA | 5.0 | 1.0 | 0.0 | Not | NaN | NaN | NaN | NaN | ... | Turbo-prop | TEXTRON AVIATION INC | 208B | 1 | 12 | Up to 12,499 | P&W CANADA | PT6A-140 | 867.0 | 0.0 |
| 2 | 2014-07-01 | N201AA | -5.0 | -3.0 | 0.0 | Not | NaN | NaN | NaN | NaN | ... | Reciprocating | CESSNA | 150 | 1 | 2 | Up to 12,499 | CONT MOTOR | 0-200 SERIES | 100.0 | 0.0 |
| 3 | 2014-07-01 | N201AA | -5.0 | 0.0 | 0.0 | Not | NaN | NaN | NaN | NaN | ... | Reciprocating | CESSNA | 150 | 1 | 2 | Up to 12,499 | CONT MOTOR | 0-200 SERIES | 100.0 | 0.0 |
| 4 | 2014-07-01 | N201AA | -1.0 | -2.0 | 0.0 | Not | NaN | NaN | NaN | NaN | ... | Reciprocating | CESSNA | 150 | 1 | 2 | Up to 12,499 | CONT MOTOR | 0-200 SERIES | 100.0 | 0.0 |
5 rows × 23 columns
Visualization¶
Review Performance per Airport¶
In [10]:
# Numeric aircraft attributes used as candidate predictors.
train_columns = [
    'NO-ENG',
    'NO-SEATS',
    'HORSEPOWER',
    'THRUST'
]
# Delay columns treated as targets/labels in the correlation analysis below.
label_columns = [
    "DEP_DELAY",
    "ARR_DELAY",
    "CARRIER_DELAY",
    "WEATHER_DELAY",
    "NAS_DELAY",
    "SECURITY_DELAY",
    "LATE_AIRCRAFT_DELAY",
]
In [13]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# One-hot encode the categorical aircraft attributes so they can enter the
# correlation computation alongside the numeric columns.
cat_columns = ['COUNTRY', 'TYPE AIRCRAFT', 'TYPE ENGINE', 'AC-WEIGHT']
num_columns = ['NO-ENG', 'NO-SEATS', 'HORSEPOWER', 'THRUST'] + label_columns
dummy_df = pd.get_dummies(target_airport_df[cat_columns])
new_df = pd.concat([target_airport_df[num_columns], dummy_df], axis=1)

# Spearman (rank) correlation is used because the delay columns are heavily
# right-skewed (see describe() above); keep only the feature-vs-label slice.
corr_matrix = new_df.corr(method="spearman")
corr_matrix = corr_matrix.loc[corr_matrix.columns.difference(label_columns), label_columns]

# Explicit figure/axes interface instead of the pyplot state machine.
sns.set(style="white")
fig, ax = plt.subplots(figsize=(16, 10))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=ax)
ax.set_title('Correlation Matrix')
plt.show()